library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)2015 AAQoL: Machine Learning Approaches
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above")))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
There are 2,609 responses, some with missing data.
Machine Learning Classifications
We are going to analyze the prediction accuracy of different machine learning algorithms in predicting the source of health information, healthcare utilization outcomes, and insurance.
We will consider the following algorithms: - Random Forest - Logistic Regression
Source of Information: Family
ps(Family)# A tibble: 4 × 3
Family n pct
<fct> <int> <dbl>
1 3 1 0.0383
2 No 1258 48.2
3 Yes 1331 51.0
4 <NA> 19 0.728
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |> filter(Family %in% c("No","Yes")) |>
mutate(Family=droplevels(Family)) |>
select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(Family=="Yes")
neg <- rfdata |> filter(Family=="No")
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(Family~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = Family ~ Age + Gender + Religion + Income_median + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 44.38%
Confusion matrix:
No Yes class.error
No 431 359 0.4544304
Yes 360 470 0.4337349
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 666 93
Yes 124 737
Accuracy : 0.866
95% CI : (0.8485, 0.8823)
No Information Rate : 0.5123
P-Value [Acc > NIR] : <2e-16
Kappa : 0.7317
Mcnemar's Test P-Value : 0.0417
Sensitivity : 0.8880
Specificity : 0.8430
Pos Pred Value : 0.8560
Neg Pred Value : 0.8775
Prevalence : 0.5123
Detection Rate : 0.4549
Detection Prevalence : 0.5315
Balanced Accuracy : 0.8655
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 175 126
Yes 179 225
Accuracy : 0.5674
95% CI : (0.5299, 0.6043)
No Information Rate : 0.5021
P-Value [Acc > NIR] : 0.0002993
Kappa : 0.1353
Mcnemar's Test P-Value : 0.0029060
Sensitivity : 0.6410
Specificity : 0.4944
Pos Pred Value : 0.5569
Neg Pred Value : 0.5814
Prevalence : 0.4979
Detection Rate : 0.3191
Detection Prevalence : 0.5730
Balanced Accuracy : 0.5677
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$Family,pred_noeth[,2])Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
set.seed(222)
ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
train <- rfdata[ind==1,]
test <- rfdata[ind==2,]
randomForest::randomForest(Family~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = Family ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 41.85%
Confusion matrix:
No Yes class.error
No 415 380 0.4779874
Yes 298 527 0.3612121
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 682 65
Yes 113 760
Accuracy : 0.8901
95% CI : (0.8739, 0.9049)
No Information Rate : 0.5093
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7799
Mcnemar's Test P-Value : 0.000427
Sensitivity : 0.9212
Specificity : 0.8579
Pos Pred Value : 0.8706
Neg Pred Value : 0.9130
Prevalence : 0.5093
Detection Rate : 0.4691
Detection Prevalence : 0.5389
Balanced Accuracy : 0.8895
'Positive' Class : Yes
Test Set
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 189 141
Yes 160 215
Accuracy : 0.573
95% CI : (0.5356, 0.6099)
No Information Rate : 0.505
P-Value [Acc > NIR] : 0.0001688
Kappa : 0.1456
Mcnemar's Test P-Value : 0.2995016
Sensitivity : 0.6039
Specificity : 0.5415
Pos Pred Value : 0.5733
Neg Pred Value : 0.5727
Prevalence : 0.5050
Detection Rate : 0.3050
Detection Prevalence : 0.5319
Balanced Accuracy : 0.5727
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$Family,pred_eth[,2])Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.62
pROC::auc(rocobj_wo)Area under the curve: 0.6133
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) No Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 8.920852 8.192125 14.727942 61.46730
Age 16.716892 17.329953 26.802343 163.45942
Gender 7.031115 7.664399 11.795893 24.35801
Religion 4.661724 7.550253 10.617795 68.66222
Employment 4.883392 4.851916 9.041486 19.42635
Income_median 9.702889 -1.012253 7.068824 22.19238
EnglishSpeak 7.260349 7.690340 12.201893 45.24792
EnglishDiff 13.430754 8.756692 16.561041 54.76251
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(Family~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,train$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 463 347
Yes 332 478
Accuracy : 0.5809
95% CI : (0.5564, 0.605)
No Information Rate : 0.5093
P-Value [Acc > NIR] : 4.34e-09
Kappa : 0.1617
Mcnemar's Test P-Value : 0.5911
Sensitivity : 0.5794
Specificity : 0.5824
Pos Pred Value : 0.5901
Neg Pred Value : 0.5716
Prevalence : 0.5093
Detection Rate : 0.2951
Detection Prevalence : 0.5000
Balanced Accuracy : 0.5809
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 197 159
Yes 152 197
Accuracy : 0.5589
95% CI : (0.5213, 0.5959)
No Information Rate : 0.505
P-Value [Acc > NIR] : 0.002342
Kappa : 0.1178
Mcnemar's Test P-Value : 0.733684
Sensitivity : 0.5534
Specificity : 0.5645
Pos Pred Value : 0.5645
Neg Pred Value : 0.5534
Prevalence : 0.5050
Detection Rate : 0.2794
Detection Prevalence : 0.4950
Balanced Accuracy : 0.5589
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$Family,predicted_probs)Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.5773
With ethnicity
Training Set
mod1w <- glm(Family~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1w,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,train$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 466 300
Yes 329 525
Accuracy : 0.6117
95% CI : (0.5875, 0.6355)
No Information Rate : 0.5093
P-Value [Acc > NIR] : <2e-16
Kappa : 0.2227
Mcnemar's Test P-Value : 0.2642
Sensitivity : 0.6364
Specificity : 0.5862
Pos Pred Value : 0.6148
Neg Pred Value : 0.6084
Prevalence : 0.5093
Detection Rate : 0.3241
Detection Prevalence : 0.5272
Balanced Accuracy : 0.6113
'Positive' Class : Yes
Test Set
predict(mod1w,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 203 149
Yes 146 207
Accuracy : 0.5816
95% CI : (0.5442, 0.6183)
No Information Rate : 0.505
P-Value [Acc > NIR] : 2.681e-05
Kappa : 0.1631
Mcnemar's Test P-Value : 0.9073
Sensitivity : 0.5815
Specificity : 0.5817
Pos Pred Value : 0.5864
Neg Pred Value : 0.5767
Prevalence : 0.5050
Detection Rate : 0.2936
Detection Prevalence : 0.5007
Balanced Accuracy : 0.5816
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$Family,predicted_probs)Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
AUROC
pROC::auc(rocobj_w)Area under the curve: 0.614
10-fold cross validated logistic regression
Without Ethnicity
x <- model.matrix(Family~Age+Gender+Religion + Income_median + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train)
mod <- cv.glmnet(x,train$Family,family="binomial",type.measure="class")
plot(mod)
print(mod)
Call: cv.glmnet(x = x, y = train$Family, type.measure = "class", family = "binomial")
Measure: Misclassification Error
Lambda Index Measure SE Nonzero
min 0.008431 20 0.4296 0.01353 11
1se 0.025746 8 0.4401 0.00987 4
x_test <- model.matrix(Family~Age+Gender+Religion + Income_median + Income_median +Employment+EnglishSpeak+EnglishDiff,data=test)
predict(mod,newx=x_test,type="response")-> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 200 166
Yes 149 190
Accuracy : 0.5532
95% CI : (0.5156, 0.5903)
No Information Rate : 0.505
P-Value [Acc > NIR] : 0.005772
Kappa : 0.1067
Mcnemar's Test P-Value : 0.367324
Sensitivity : 0.5337
Specificity : 0.5731
Pos Pred Value : 0.5605
Neg Pred Value : 0.5464
Prevalence : 0.5050
Detection Rate : 0.2695
Detection Prevalence : 0.4809
Balanced Accuracy : 0.5534
'Positive' Class : Yes
With Ethnicity
x <- model.matrix(Family~Age+Ethnicity+Gender+Religion + Income_median + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train)
mod <- cv.glmnet(x,train$Family,family="binomial",type.measure="class")
plot(mod)
print(mod)
Call: cv.glmnet(x = x, y = train$Family, type.measure = "class", family = "binomial")
Measure: Misclassification Error
Lambda Index Measure SE Nonzero
min 0.006378 23 0.4074 0.01104 16
1se 0.019476 11 0.4179 0.01353 8
x_test <- model.matrix(Family~Age+Ethnicity+Gender+Religion + Income_median + Income_median +Employment+EnglishSpeak+EnglishDiff,data=test)
predict(mod,newx=x_test,type="response")-> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 182 152
Yes 167 204
Accuracy : 0.5475
95% CI : (0.5099, 0.5847)
No Information Rate : 0.505
P-Value [Acc > NIR] : 0.01308
Kappa : 0.0946
Mcnemar's Test P-Value : 0.43313
Sensitivity : 0.5730
Specificity : 0.5215
Pos Pred Value : 0.5499
Neg Pred Value : 0.5449
Prevalence : 0.5050
Detection Rate : 0.2894
Detection Prevalence : 0.5262
Balanced Accuracy : 0.5473
'Positive' Class : Yes
Logistic Regression (ANOVA)
glm(Family~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(Family~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = Family ~ Age + Ethnicity + Gender + Religion +
Income_median + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.412513 0.310144 4.554 5.25e-06 ***
Age -0.015234 0.002879 -5.291 1.21e-07 ***
EthnicityAsian Indian -0.253127 0.258076 -0.981 0.326679
EthnicityFilipino -0.206671 0.196541 -1.052 0.293011
EthnicityKorean -0.629756 0.147117 -4.281 1.86e-05 ***
EthnicityOther -0.778697 0.217292 -3.584 0.000339 ***
EthnicityVietnamese -0.768876 0.156059 -4.927 8.36e-07 ***
GenderMale 0.277602 0.090337 3.073 0.002119 **
ReligionCatholic 0.008662 0.166002 0.052 0.958386
ReligionHindu -0.228203 0.278141 -0.820 0.411955
ReligionMuslim -0.180373 0.333239 -0.541 0.588320
ReligionNone -0.161396 0.168213 -0.959 0.337322
ReligionOther -0.577008 0.356426 -1.619 0.105475
ReligionProtestant -0.071672 0.168970 -0.424 0.671440
Income_medianAbove -0.162998 0.093930 -1.735 0.082684 .
EmploymentEmployed full time -0.422889 0.093189 -4.538 5.68e-06 ***
EnglishSpeakNot well -0.161232 0.222389 -0.725 0.468453
EnglishSpeakVery well -0.022743 0.234261 -0.097 0.922659
EnglishSpeakWell -0.238970 0.222822 -1.072 0.283509
EnglishDiffMuch 0.379450 0.143168 2.650 0.008040 **
EnglishDiffNot much -0.006446 0.133190 -0.048 0.961402
EnglishDiffVery much -0.255610 0.127932 -1.998 0.045715 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3222.5 on 2324 degrees of freedom
Residual deviance: 3097.2 on 2303 degrees of freedom
AIC: 3141.2
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Family
LR Chisq Df Pr(>Chisq)
Age 28.356 1 1.009e-07 ***
Ethnicity 40.633 5 1.113e-07 ***
Gender 9.494 1 0.0020613 **
Religion 3.965 6 0.6814028
Income_median 3.014 1 0.0825692 .
Employment 20.743 1 5.254e-06 ***
EnglishSpeak 3.845 3 0.2787365
EnglishDiff 20.193 3 0.0001548 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.212370 1 1.101077
Ethnicity 14.584443 5 1.307341
Gender 1.117560 1 1.057147
Religion 12.852793 6 1.237133
Income_median 1.214367 1 1.101983
Employment 1.191594 1 1.091602
EnglishSpeak 2.366268 3 1.154367
EnglishDiff 1.802170 3 1.103145
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: Family ~ Age + Gender + Religion + Income_median + Employment +
EnglishSpeak + EnglishDiff
Model 2: Family ~ Age + Ethnicity + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2308 3137.8
2 2303 3097.2 5 40.633 1.113e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 3171.848 3141.215
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 3269.623 3267.748
Source of Information: Health Professional
ps(`Heal Professionals`)# A tibble: 3 × 3
`Heal Professionals` n pct
<fct> <int> <dbl>
1 No 1326 50.8
2 Yes 1264 48.4
3 <NA> 19 0.728
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
set.seed(222)
pos<- rfdata |> filter(`Heal Professionals`=="Yes")
neg <- rfdata |> filter(`Heal Professionals`=="No")
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Heal Professionals`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Heal Professionals` ~ Age + Gender + Religion + Income_median + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 40.25%
Confusion matrix:
No Yes class.error
No 449 341 0.4316456
Yes 311 519 0.3746988
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 623 137
Yes 167 693
Accuracy : 0.8123
95% CI : (0.7925, 0.8311)
No Information Rate : 0.5123
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.6241
Mcnemar's Test P-Value : 0.09626
Sensitivity : 0.8349
Specificity : 0.7886
Pos Pred Value : 0.8058
Neg Pred Value : 0.8197
Prevalence : 0.5123
Detection Rate : 0.4278
Detection Prevalence : 0.5309
Balanced Accuracy : 0.8118
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 198 126
Yes 157 225
Accuracy : 0.5992
95% CI : (0.5619, 0.6355)
No Information Rate : 0.5028
P-Value [Acc > NIR] : 1.719e-07
Kappa : 0.1987
Mcnemar's Test P-Value : 0.07454
Sensitivity : 0.6410
Specificity : 0.5577
Pos Pred Value : 0.5890
Neg Pred Value : 0.6111
Prevalence : 0.4972
Detection Rate : 0.3187
Detection Prevalence : 0.5411
Balanced Accuracy : 0.5994
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Heal Professionals`,pred_noeth[,2])Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Heal Professionals`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Heal Professionals` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 41.79%
Confusion matrix:
No Yes class.error
No 428 362 0.4582278
Yes 315 515 0.3795181
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 658 89
Yes 132 741
Accuracy : 0.8636
95% CI : (0.8459, 0.8799)
No Information Rate : 0.5123
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7266
Mcnemar's Test P-Value : 0.004725
Sensitivity : 0.8928
Specificity : 0.8329
Pos Pred Value : 0.8488
Neg Pred Value : 0.8809
Prevalence : 0.5123
Detection Rate : 0.4574
Detection Prevalence : 0.5389
Balanced Accuracy : 0.8628
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 196 126
Yes 159 225
Accuracy : 0.5963
95% CI : (0.5591, 0.6327)
No Information Rate : 0.5028
P-Value [Acc > NIR] : 3.792e-07
Kappa : 0.193
Mcnemar's Test P-Value : 0.05802
Sensitivity : 0.6410
Specificity : 0.5521
Pos Pred Value : 0.5859
Neg Pred Value : 0.6087
Prevalence : 0.4972
Detection Rate : 0.3187
Detection Prevalence : 0.5439
Balanced Accuracy : 0.5966
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Heal Professionals`,pred_eth[,2])Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6281
pROC::auc(rocobj_wo)Area under the curve: 0.6295
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) No Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 6.8723526 5.274480 10.450012 63.11341
Age 6.7639350 2.862068 7.678002 144.71112
Gender 7.2051945 -4.657028 2.196017 22.19345
Religion 11.3213495 1.855341 11.712113 67.30961
Employment 2.0895049 -3.011594 -0.841149 18.22617
Income_median -0.6843792 12.339876 10.228645 22.95115
EnglishSpeak 9.4155890 15.952908 22.049464 51.37707
EnglishDiff 5.8334537 7.086395 10.370327 53.13316
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Heal Professionals`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 482 298
Yes 308 532
Accuracy : 0.6259
95% CI : (0.6018, 0.6496)
No Information Rate : 0.5123
P-Value [Acc > NIR] : <2e-16
Kappa : 0.2512
Mcnemar's Test P-Value : 0.7147
Sensitivity : 0.6410
Specificity : 0.6101
Pos Pred Value : 0.6333
Neg Pred Value : 0.6179
Prevalence : 0.5123
Detection Rate : 0.3284
Detection Prevalence : 0.5185
Balanced Accuracy : 0.6255
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 198 123
Yes 157 228
Accuracy : 0.6034
95% CI : (0.5662, 0.6397)
No Information Rate : 0.5028
P-Value [Acc > NIR] : 5.022e-08
Kappa : 0.2072
Mcnemar's Test P-Value : 0.0486
Sensitivity : 0.6496
Specificity : 0.5577
Pos Pred Value : 0.5922
Neg Pred Value : 0.6168
Prevalence : 0.4972
Detection Rate : 0.3229
Detection Prevalence : 0.5453
Balanced Accuracy : 0.6037
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Heal Professionals`,predicted_probs)Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6518
With ethnicity
Training Set
mod1 <- glm(`Heal Professionals`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 488 302
Yes 302 528
Accuracy : 0.6272
95% CI : (0.6031, 0.6508)
No Information Rate : 0.5123
P-Value [Acc > NIR] : <2e-16
Kappa : 0.2539
Mcnemar's Test P-Value : 1
Sensitivity : 0.6361
Specificity : 0.6177
Pos Pred Value : 0.6361
Neg Pred Value : 0.6177
Prevalence : 0.5123
Detection Rate : 0.3259
Detection Prevalence : 0.5123
Balanced Accuracy : 0.6269
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 202 129
Yes 153 222
Accuracy : 0.6006
95% CI : (0.5634, 0.6369)
No Information Rate : 0.5028
P-Value [Acc > NIR] : 1.147e-07
Kappa : 0.2014
Mcnemar's Test P-Value : 0.1708
Sensitivity : 0.6325
Specificity : 0.5690
Pos Pred Value : 0.5920
Neg Pred Value : 0.6103
Prevalence : 0.4972
Detection Rate : 0.3144
Detection Prevalence : 0.5312
Balanced Accuracy : 0.6007
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Heal Professionals`,predicted_probs)Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
pROC::auc(rocobj_w)Area under the curve: 0.6516
Logistic Regression (ANOVA)
glm(`Heal Professionals`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Heal Professionals`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Heal Professionals` ~ Age + Ethnicity + Gender +
Religion + Income_median + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.069703 0.320868 -3.334 0.000857 ***
Age 0.013022 0.002963 4.395 1.11e-05 ***
EthnicityAsian Indian -0.332907 0.262160 -1.270 0.204133
EthnicityFilipino 0.309411 0.206642 1.497 0.134308
EthnicityKorean -0.422121 0.150156 -2.811 0.004936 **
EthnicityOther -0.132630 0.219938 -0.603 0.546487
EthnicityVietnamese 0.009209 0.157761 0.058 0.953453
GenderMale 0.003843 0.091936 0.042 0.966656
ReligionCatholic -0.261099 0.169243 -1.543 0.122893
ReligionHindu -0.613612 0.282048 -2.176 0.029588 *
ReligionMuslim 0.087267 0.341719 0.255 0.798433
ReligionNone -0.352720 0.170602 -2.068 0.038687 *
ReligionOther -0.030182 0.362158 -0.083 0.933583
ReligionProtestant -0.292490 0.171533 -1.705 0.088167 .
Income_medianAbove 0.350320 0.095955 3.651 0.000261 ***
EmploymentEmployed full time 0.015537 0.095084 0.163 0.870205
EnglishSpeakNot well 0.474961 0.235790 2.014 0.043974 *
EnglishSpeakVery well 1.504142 0.248281 6.058 1.38e-09 ***
EnglishSpeakWell 1.039967 0.235952 4.408 1.05e-05 ***
EnglishDiffMuch -0.374823 0.145344 -2.579 0.009912 **
EnglishDiffNot much -0.298402 0.135414 -2.204 0.027550 *
EnglishDiffVery much -0.324838 0.130984 -2.480 0.013139 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3224.0 on 2325 degrees of freedom
Residual deviance: 2992.9 on 2304 degrees of freedom
AIC: 3036.9
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Heal Professionals
LR Chisq Df Pr(>Chisq)
Age 19.553 1 9.782e-06 ***
Ethnicity 17.556 5 0.0035576 **
Gender 0.002 1 0.9666567
Religion 11.119 6 0.0847625 .
Income_median 13.344 1 0.0002592 ***
Employment 0.027 1 0.8702167
EnglishSpeak 60.295 3 5.085e-13 ***
EnglishDiff 9.252 3 0.0261213 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.237915 1 1.112616
Ethnicity 13.899874 5 1.301071
Gender 1.106695 1 1.051996
Religion 12.519910 6 1.234431
Income_median 1.209973 1 1.099988
Employment 1.184012 1 1.088123
EnglishSpeak 2.318122 3 1.150419
EnglishDiff 1.750171 3 1.097775
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Heal Professionals` ~ Age + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Model 2: `Heal Professionals` ~ Age + Ethnicity + Gender + Religion +
Income_median + Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2309 3010.4
2 2304 2992.9 5 17.556 0.003558 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 3044.447 3036.891
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 3142.229 3163.433
Health Insurance
ps(`Health Insurance`)# A tibble: 3 × 3
`Health Insurance` n pct
<fct> <int> <dbl>
1 0 381 14.6
2 Yes 2207 84.6
3 <NA> 21 0.805
Imbalanced Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Health Insurance`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
imb <- imbalanced(`Health Insurance` ~ Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,importance=T,data=rfdata |> as.data.frame(),
perf.type = "gmean")
print(imb) Sample size: 2326
Frequency of class labels: 318, 2008
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 406.2587
No. of variables tried at each split: 3
Total no. of variables: 7
Resampling used to grow trees: swor
Resample size used to grow trees: 1470
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 6.3145
(OOB) Brier score: 0.12231347
(OOB) Normalized Brier score: 0.48925389
(OOB) AUC: 0.68812016
(OOB) PR-AUC: 0.24133228
(OOB) G-mean: 0.6392782
(OOB) Requested performance error: 0.3607218
Confusion matrix:
predicted
observed 0 Yes class.error
0 186 132 0.4151
Yes 605 1403 0.3013
(OOB) Misclassification rate: 0.316853
imb_e <- imbalanced(`Health Insurance` ~ Ethnicity+Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,importance=T,data=rfdata |> as.data.frame(),
perf.type = "gmean")
print(imb_e) Sample size: 2326
Frequency of class labels: 318, 2008
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 407.8643
No. of variables tried at each split: 3
Total no. of variables: 8
Resampling used to grow trees: swor
Resample size used to grow trees: 1470
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 6.3145
(OOB) Brier score: 0.11569482
(OOB) Normalized Brier score: 0.46277926
(OOB) AUC: 0.70981326
(OOB) PR-AUC: 0.2696439
(OOB) G-mean: 0.65367487
(OOB) Requested performance error: 0.34632513
Confusion matrix:
predicted
observed 0 Yes class.error
0 198 120 0.3774
Yes 630 1378 0.3137
(OOB) Misclassification rate: 0.322442
get.imbalanced.performance(imb_e) n.majority n.minority iratio threshold sens spec
2008.0000000 318.0000000 6.3144654 0.1367154 0.6226415 0.6862550
prec npv misclass brier brier.norm auc
0.2391304 0.9198932 0.3224420 0.1156948 0.4627793 0.7098133
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3455497 0.4800684 0.1367154 0.2696439 0.4996123 0.5668716
gmean
0.6536749
plot(imb_e,plots.one.page = FALSE)

all 0 Yes
EnglishSpeak 0.0267 NA NA
Employment 0.0107 NA NA
Income_median 0.0066 NA NA
Age 0.0028 NA NA
Ethnicity -0.0007 NA NA
Religion -0.0008 NA NA
Gender -0.0027 NA NA
EnglishDiff -0.0138 NA NA
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Health Insurance`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Health Insurance`=="Yes")
neg <- rfdata |> filter(`Health Insurance`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Health Insurance`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Health Insurance` ~ Age + Gender + Religion + Income_median + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 13.09%
Confusion matrix:
0 Yes class.error
0 15 204 0.931506849
Yes 8 1393 0.005710207
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 45 3
Yes 174 1398
Accuracy : 0.8907
95% CI : (0.8745, 0.9055)
No Information Rate : 0.8648
P-Value [Acc > NIR] : 0.0009868
Kappa : 0.3032
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9979
Specificity : 0.2055
Pos Pred Value : 0.8893
Neg Pred Value : 0.9375
Prevalence : 0.8648
Detection Rate : 0.8630
Detection Prevalence : 0.9704
Balanced Accuracy : 0.6017
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 7 9
Yes 92 598
Accuracy : 0.8569
95% CI : (0.8289, 0.8819)
No Information Rate : 0.8598
P-Value [Acc > NIR] : 0.6114
Kappa : 0.0861
Mcnemar's Test P-Value : 3.37e-16
Sensitivity : 0.98517
Specificity : 0.07071
Pos Pred Value : 0.86667
Neg Pred Value : 0.43750
Prevalence : 0.85977
Detection Rate : 0.84703
Detection Prevalence : 0.97734
Balanced Accuracy : 0.52794
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Health Insurance`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Health Insurance`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Health Insurance` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 13.15%
Confusion matrix:
0 Yes class.error
0 19 200 0.913242009
Yes 13 1388 0.009279086
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 68 3
Yes 151 1398
Accuracy : 0.9049
95% CI : (0.8896, 0.9188)
No Information Rate : 0.8648
P-Value [Acc > NIR] : 4.747e-07
Kappa : 0.4313
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9979
Specificity : 0.3105
Pos Pred Value : 0.9025
Neg Pred Value : 0.9577
Prevalence : 0.8648
Detection Rate : 0.8630
Detection Prevalence : 0.9562
Balanced Accuracy : 0.6542
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 7 9
Yes 92 598
Accuracy : 0.8569
95% CI : (0.8289, 0.8819)
No Information Rate : 0.8598
P-Value [Acc > NIR] : 0.6114
Kappa : 0.0861
Mcnemar's Test P-Value : 3.37e-16
Sensitivity : 0.98517
Specificity : 0.07071
Pos Pred Value : 0.86667
Neg Pred Value : 0.43750
Prevalence : 0.85977
Detection Rate : 0.84703
Detection Prevalence : 0.97734
Balanced Accuracy : 0.52794
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Health Insurance`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6595
pROC::auc(rocobj_wo)Area under the curve: 0.6492
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity -1.2482765 20.160916 19.770442 29.08883
Age 5.5208083 11.939043 14.057638 64.56691
Gender -8.3547869 9.028860 4.643291 10.21576
Religion -0.4123487 13.077109 12.499687 30.95355
Employment 3.1321154 14.784372 15.647581 11.03695
Income_median 17.5013381 6.595574 13.646661 20.21276
EnglishSpeak 19.9324178 12.465397 20.155768 29.43936
EnglishDiff -4.2226014 12.881532 10.593832 23.25288
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Health Insurance`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 15 16
Yes 204 1385
Accuracy : 0.8642
95% CI : (0.8465, 0.8805)
No Information Rate : 0.8648
P-Value [Acc > NIR] : 0.5469
Kappa : 0.0895
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.98858
Specificity : 0.06849
Pos Pred Value : 0.87162
Neg Pred Value : 0.48387
Prevalence : 0.86481
Detection Rate : 0.85494
Detection Prevalence : 0.98086
Balanced Accuracy : 0.52854
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 11 12
Yes 88 595
Accuracy : 0.8584
95% CI : (0.8304, 0.8832)
No Information Rate : 0.8598
P-Value [Acc > NIR] : 0.5695
Kappa : 0.1346
Mcnemar's Test P-Value : 6.382e-14
Sensitivity : 0.9802
Specificity : 0.1111
Pos Pred Value : 0.8712
Neg Pred Value : 0.4783
Prevalence : 0.8598
Detection Rate : 0.8428
Detection Prevalence : 0.9674
Balanced Accuracy : 0.5457
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Health Insurance`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6865
With ethnicity
Training Set
mod1 <- glm(`Health Insurance`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 18 15
Yes 201 1386
Accuracy : 0.8667
95% CI : (0.8491, 0.8829)
No Information Rate : 0.8648
P-Value [Acc > NIR] : 0.4313
Kappa : 0.1114
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.98929
Specificity : 0.08219
Pos Pred Value : 0.87335
Neg Pred Value : 0.54545
Prevalence : 0.86481
Detection Rate : 0.85556
Detection Prevalence : 0.97963
Balanced Accuracy : 0.53574
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 10 10
Yes 89 597
Accuracy : 0.8598
95% CI : (0.8319, 0.8845)
No Information Rate : 0.8598
P-Value [Acc > NIR] : 0.5268
Kappa : 0.1269
Mcnemar's Test P-Value : 4.531e-15
Sensitivity : 0.9835
Specificity : 0.1010
Pos Pred Value : 0.8703
Neg Pred Value : 0.5000
Prevalence : 0.8598
Detection Rate : 0.8456
Detection Prevalence : 0.9717
Balanced Accuracy : 0.5423
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Health Insurance`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
pROC::auc(rocobj_w)Area under the curve: 0.6843
Logistic Regression (ANOVA)
glm(`Health Insurance`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Health Insurance`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Health Insurance` ~ Age + Ethnicity + Gender +
Religion + Income_median + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.510521 0.395212 -1.292 0.19644
Age 0.011330 0.004033 2.810 0.00496 **
EthnicityAsian Indian -0.486014 0.408447 -1.190 0.23408
EthnicityFilipino -0.471155 0.317208 -1.485 0.13746
EthnicityKorean -0.607176 0.219908 -2.761 0.00576 **
EthnicityOther -0.791527 0.307064 -2.578 0.00995 **
EthnicityVietnamese -0.317983 0.232255 -1.369 0.17096
GenderMale -0.100018 0.134143 -0.746 0.45590
ReligionCatholic 0.397173 0.249217 1.594 0.11101
ReligionHindu 0.146376 0.434478 0.337 0.73619
ReligionMuslim -0.152987 0.456051 -0.335 0.73728
ReligionNone 0.135992 0.243118 0.559 0.57591
ReligionOther -0.956076 0.420425 -2.274 0.02296 *
ReligionProtestant 0.053400 0.246001 0.217 0.82815
Income_medianAbove 1.172950 0.155904 7.524 5.33e-14 ***
EmploymentEmployed full time 0.830118 0.150618 5.511 3.56e-08 ***
EnglishSpeakNot well 1.412586 0.248901 5.675 1.38e-08 ***
EnglishSpeakVery well 1.880227 0.284707 6.604 4.00e-11 ***
EnglishSpeakWell 1.533159 0.253987 6.036 1.58e-09 ***
EnglishDiffMuch 0.047166 0.219994 0.214 0.83024
EnglishDiffNot much 0.026306 0.206698 0.127 0.89873
EnglishDiffVery much -0.202149 0.203607 -0.993 0.32079
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1855.9 on 2325 degrees of freedom
Residual deviance: 1619.3 on 2304 degrees of freedom
AIC: 1663.3
Number of Fisher Scoring iterations: 5
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Health Insurance
LR Chisq Df Pr(>Chisq)
Age 8.081 1 0.004474 **
Ethnicity 11.070 5 0.050002 .
Gender 0.555 1 0.456198
Religion 11.367 6 0.077678 .
Income_median 62.892 1 2.184e-15 ***
Employment 32.437 1 1.231e-08 ***
EnglishSpeak 46.951 3 3.559e-10 ***
EnglishDiff 1.692 3 0.638665
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.302451 1 1.141250
Ethnicity 14.391565 5 1.305602
Gender 1.076528 1 1.037559
Religion 12.067377 6 1.230650
Income_median 1.113409 1 1.055182
Employment 1.121284 1 1.058907
EnglishSpeak 2.569776 3 1.170350
EnglishDiff 1.816261 3 1.104578
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Health Insurance` ~ Age + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Model 2: `Health Insurance` ~ Age + Ethnicity + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2309 1630.3
2 2304 1619.3 5 11.07 0.05 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 1664.353 1663.283
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 1762.135 1789.824
Dental Insurance
ps(`Dental Insurance`)# A tibble: 3 × 3
`Dental Insurance` n pct
<fct> <int> <dbl>
1 0 1050 40.2
2 Yes 1529 58.6
3 <NA> 30 1.15
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Dental Insurance`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Dental Insurance`=="Yes")
neg <- rfdata |> filter(`Dental Insurance`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Dental Insurance`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Dental Insurance` ~ Age + Gender + Religion + Income_median + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 26.76%
Confusion matrix:
0 Yes class.error
0 390 248 0.3887147
Yes 185 795 0.1887755
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 488 106
Yes 150 874
Accuracy : 0.8418
95% CI : (0.8231, 0.8592)
No Information Rate : 0.6057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6647
Mcnemar's Test P-Value : 0.007199
Sensitivity : 0.8918
Specificity : 0.7649
Pos Pred Value : 0.8535
Neg Pred Value : 0.8215
Prevalence : 0.6057
Detection Rate : 0.5402
Detection Prevalence : 0.6329
Balanced Accuracy : 0.8284
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 179 78
Yes 98 348
Accuracy : 0.7496
95% CI : (0.7159, 0.7813)
No Information Rate : 0.606
P-Value [Acc > NIR] : 6.607e-16
Kappa : 0.469
Mcnemar's Test P-Value : 0.1521
Sensitivity : 0.8169
Specificity : 0.6462
Pos Pred Value : 0.7803
Neg Pred Value : 0.6965
Prevalence : 0.6060
Detection Rate : 0.4950
Detection Prevalence : 0.6344
Balanced Accuracy : 0.7316
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Dental Insurance`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Dental Insurance`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Dental Insurance` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 26.08%
Confusion matrix:
0 Yes class.error
0 389 249 0.3902821
Yes 173 807 0.1765306
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 523 82
Yes 115 898
Accuracy : 0.8782
95% CI : (0.8613, 0.8938)
No Information Rate : 0.6057
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.7428
Mcnemar's Test P-Value : 0.02261
Sensitivity : 0.9163
Specificity : 0.8197
Pos Pred Value : 0.8865
Neg Pred Value : 0.8645
Prevalence : 0.6057
Detection Rate : 0.5550
Detection Prevalence : 0.6261
Balanced Accuracy : 0.8680
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 179 75
Yes 98 351
Accuracy : 0.7539
95% CI : (0.7203, 0.7853)
No Information Rate : 0.606
P-Value [Acc > NIR] : <2e-16
Kappa : 0.4771
Mcnemar's Test P-Value : 0.0944
Sensitivity : 0.8239
Specificity : 0.6462
Pos Pred Value : 0.7817
Neg Pred Value : 0.7047
Prevalence : 0.6060
Detection Rate : 0.4993
Detection Prevalence : 0.6387
Balanced Accuracy : 0.7351
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Dental Insurance`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.8034
pROC::auc(rocobj_wo)Area under the curve: 0.8062
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 7.557246 20.593481 22.852662 58.44735
Age 5.145640 30.689138 28.358535 126.24484
Gender -6.552143 11.887279 5.323852 19.62499
Religion -3.098080 16.360793 10.826548 54.69787
Employment 17.265627 27.402089 33.314185 38.27099
Income_median 44.663947 34.266866 53.831127 85.47970
EnglishSpeak 23.954258 18.251501 32.612176 57.60092
EnglishDiff 4.995635 4.418307 7.379507 42.60936
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Dental Insurance`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 387 172
Yes 251 808
Accuracy : 0.7386
95% CI : (0.7164, 0.7598)
No Information Rate : 0.6057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.4406
Mcnemar's Test P-Value : 0.0001491
Sensitivity : 0.8245
Specificity : 0.6066
Pos Pred Value : 0.7630
Neg Pred Value : 0.6923
Prevalence : 0.6057
Detection Rate : 0.4994
Detection Prevalence : 0.6545
Balanced Accuracy : 0.7155
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 173 73
Yes 104 353
Accuracy : 0.7482
95% CI : (0.7144, 0.7799)
No Information Rate : 0.606
P-Value [Acc > NIR] : 1.289e-15
Kappa : 0.4622
Mcnemar's Test P-Value : 0.02414
Sensitivity : 0.8286
Specificity : 0.6245
Pos Pred Value : 0.7724
Neg Pred Value : 0.7033
Prevalence : 0.6060
Detection Rate : 0.5021
Detection Prevalence : 0.6501
Balanced Accuracy : 0.7266
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Dental Insurance`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.8111
With ethnicity
Training Set
mod1 <- glm(`Dental Insurance`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 390 170
Yes 248 810
Accuracy : 0.7417
95% CI : (0.7196, 0.7628)
No Information Rate : 0.6057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.4474
Mcnemar's Test P-Value : 0.0001658
Sensitivity : 0.8265
Specificity : 0.6113
Pos Pred Value : 0.7656
Neg Pred Value : 0.6964
Prevalence : 0.6057
Detection Rate : 0.5006
Detection Prevalence : 0.6539
Balanced Accuracy : 0.7189
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 180 76
Yes 97 350
Accuracy : 0.7539
95% CI : (0.7203, 0.7853)
No Information Rate : 0.606
P-Value [Acc > NIR] : <2e-16
Kappa : 0.4777
Mcnemar's Test P-Value : 0.1284
Sensitivity : 0.8216
Specificity : 0.6498
Pos Pred Value : 0.7830
Neg Pred Value : 0.7031
Prevalence : 0.6060
Detection Rate : 0.4979
Detection Prevalence : 0.6358
Balanced Accuracy : 0.7357
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Dental Insurance`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
AUROC
pROC::auc(rocobj_w)Area under the curve: 0.8105
pROC::auc(rocobj_wo)Area under the curve: 0.8111
Logistic Regression (ANOVA)
glm(`Dental Insurance`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Dental Insurance`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Dental Insurance` ~ Age + Ethnicity + Gender +
Religion + Income_median + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.257474 0.360477 -3.488 0.000486 ***
Age 0.002352 0.003245 0.725 0.468647
EthnicityAsian Indian 0.362283 0.332939 1.088 0.276536
EthnicityFilipino 0.324481 0.238777 1.359 0.174169
EthnicityKorean -0.674130 0.166433 -4.050 5.11e-05 ***
EthnicityOther 0.034767 0.249577 0.139 0.889210
EthnicityVietnamese -0.067497 0.175481 -0.385 0.700502
GenderMale -0.394035 0.104875 -3.757 0.000172 ***
ReligionCatholic 0.040134 0.188669 0.213 0.831545
ReligionHindu -0.924886 0.352779 -2.622 0.008749 **
ReligionMuslim -1.113568 0.395805 -2.813 0.004902 **
ReligionNone -0.204393 0.190361 -1.074 0.282951
ReligionOther -0.658405 0.401865 -1.638 0.101344
ReligionProtestant -0.225753 0.190603 -1.184 0.236251
Income_medianAbove 1.502185 0.107486 13.976 < 2e-16 ***
EmploymentEmployed full time 1.011055 0.106546 9.489 < 2e-16 ***
EnglishSpeakNot well 0.863201 0.267525 3.227 0.001253 **
EnglishSpeakVery well 1.670884 0.281184 5.942 2.81e-09 ***
EnglishSpeakWell 1.120356 0.267330 4.191 2.78e-05 ***
EnglishDiffMuch -0.387073 0.166355 -2.327 0.019976 *
EnglishDiffNot much -0.157066 0.156553 -1.003 0.315729
EnglishDiffVery much -0.330735 0.155169 -2.131 0.033052 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3112.9 on 2320 degrees of freedom
Residual deviance: 2449.7 on 2299 degrees of freedom
AIC: 2493.7
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Dental Insurance
LR Chisq Df Pr(>Chisq)
Age 0.525 1 0.4685489
Ethnicity 29.033 5 2.285e-05 ***
Gender 14.288 1 0.0001569 ***
Religion 13.248 6 0.0392639 *
Income_median 208.566 1 < 2.2e-16 ***
Employment 92.560 1 < 2.2e-16 ***
EnglishSpeak 44.681 3 1.082e-09 ***
EnglishDiff 7.845 3 0.0493355 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.203437 1 1.097013
Ethnicity 16.057430 5 1.319981
Gender 1.114622 1 1.055757
Religion 14.485593 6 1.249524
Income_median 1.136711 1 1.066166
Employment 1.118303 1 1.057498
EnglishSpeak 2.181614 3 1.138841
EnglishDiff 1.787319 3 1.101625
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Dental Insurance` ~ Age + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Model 2: `Dental Insurance` ~ Age + Ethnicity + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2304 2478.8
2 2299 2449.7 5 29.033 2.285e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 2512.762 2493.73
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 2610.508 2620.224
Urgent Care utilization in the past 12 months
ps(`Urgentcare`)# A tibble: 3 × 3
Urgentcare n pct
<fct> <int> <dbl>
1 0 2112 81.0
2 Yes 440 16.9
3 <NA> 57 2.18
Imbalanced Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Urgentcare`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
imb <- imbalanced(`Urgentcare`~ Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,importance=T,data=rfdata |> as.data.frame())
print(imb) Sample size: 2297
Frequency of class labels: 1901, 396
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 524.915
No. of variables tried at each split: 3
Total no. of variables: 7
Resampling used to grow trees: swor
Resample size used to grow trees: 1452
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 4.8005
(OOB) Brier score: 0.16619437
(OOB) Normalized Brier score: 0.66477747
(OOB) AUC: 0.50623542
(OOB) PR-AUC: 0.1847879
(OOB) G-mean: 0.4908176
(OOB) Requested performance error: 0.5091824
Confusion matrix:
predicted
observed 0 Yes class.error
0 1170 731 0.3845
Yes 241 155 0.6086
(OOB) Misclassification rate: 0.4231606
imb_e <- imbalanced(`Urgentcare` ~ Ethnicity+Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,importance=T,data=rfdata |> as.data.frame())
print(imb_e) Sample size: 2297
Frequency of class labels: 1901, 396
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 527.1183
No. of variables tried at each split: 3
Total no. of variables: 8
Resampling used to grow trees: swor
Resample size used to grow trees: 1452
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 4.8005
(OOB) Brier score: 0.16102889
(OOB) Normalized Brier score: 0.64411556
(OOB) AUC: 0.51717065
(OOB) PR-AUC: 0.1855978
(OOB) G-mean: 0.50439226
(OOB) Requested performance error: 0.49560774
Confusion matrix:
predicted
observed 0 Yes class.error
0 1140 761 0.4003
Yes 228 168 0.5758
(OOB) Misclassification rate: 0.4305616
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Urgentcare`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Urgentcare`=="Yes")
neg <- rfdata |> filter(`Urgentcare`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Urgentcare`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = Urgentcare ~ Age + Gender + Religion + Income_median + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 17.35%
Confusion matrix:
0 Yes class.error
0 1314 6 0.004545455
Yes 271 6 0.978339350
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Urgentcare`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1320 234
Yes 0 43
Accuracy : 0.8535
95% CI : (0.8352, 0.8705)
No Information Rate : 0.8265
P-Value [Acc > NIR] : 0.002102
Kappa : 0.233
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.15523
Specificity : 1.00000
Pos Pred Value : 1.00000
Neg Pred Value : 0.84942
Prevalence : 0.17345
Detection Rate : 0.02693
Detection Prevalence : 0.02693
Balanced Accuracy : 0.57762
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Urgentcare`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 576 116
Yes 5 3
Accuracy : 0.8271
95% CI : (0.7971, 0.8544)
No Information Rate : 0.83
P-Value [Acc > NIR] : 0.6033
Kappa : 0.0264
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.025210
Specificity : 0.991394
Pos Pred Value : 0.375000
Neg Pred Value : 0.832370
Prevalence : 0.170000
Detection Rate : 0.004286
Detection Prevalence : 0.011429
Balanced Accuracy : 0.508302
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Urgentcare`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Urgentcare`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = Urgentcare ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 17.35%
Confusion matrix:
0 Yes class.error
0 1315 5 0.003787879
Yes 272 5 0.981949458
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Urgentcare`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1320 211
Yes 0 66
Accuracy : 0.8679
95% CI : (0.8503, 0.8841)
No Information Rate : 0.8265
P-Value [Acc > NIR] : 3.859e-06
Kappa : 0.3408
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.23827
Specificity : 1.00000
Pos Pred Value : 1.00000
Neg Pred Value : 0.86218
Prevalence : 0.17345
Detection Rate : 0.04133
Detection Prevalence : 0.04133
Balanced Accuracy : 0.61913
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Urgentcare`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 578 117
Yes 3 2
Accuracy : 0.8286
95% CI : (0.7986, 0.8558)
No Information Rate : 0.83
P-Value [Acc > NIR] : 0.5642
Kappa : 0.0188
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.016807
Specificity : 0.994836
Pos Pred Value : 0.400000
Neg Pred Value : 0.831655
Prevalence : 0.170000
Detection Rate : 0.002857
Detection Prevalence : 0.007143
Balanced Accuracy : 0.505822
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Urgentcare`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.545
pROC::auc(rocobj_wo)Area under the curve: 0.5138
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 11.6145353 -7.8065669 8.7922079 32.88142
Age 6.7558639 4.5096817 8.4237882 97.48854
Gender 0.6559941 -0.3758339 0.4770087 12.41594
Religion 14.9277774 -6.9584205 12.9841732 34.60881
Employment 8.4982020 -7.5176839 5.4687781 11.38487
Income_median 2.7244192 0.9340281 3.1566536 12.61743
EnglishSpeak 10.7440682 -1.3385698 10.2700631 23.21576
EnglishDiff 7.8835926 -2.2650121 6.7297928 28.40862
Urgent care IS NOT really predicted well. Might as well just say “yes” to everything since accuracy is not significantly different from NIR.
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Urgentcare`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Urgentcare`,positive="Yes")Warning in confusionMatrix.default(pred_noeth, train$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1320 277
Yes 0 0
Accuracy : 0.8265
95% CI : (0.8071, 0.8448)
No Information Rate : 0.8265
P-Value [Acc > NIR] : 0.516
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.0000
Specificity : 1.0000
Pos Pred Value : NaN
Neg Pred Value : 0.8265
Prevalence : 0.1735
Detection Rate : 0.0000
Detection Prevalence : 0.0000
Balanced Accuracy : 0.5000
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Urgentcare`,positive="Yes")Warning in confusionMatrix.default(pred_noeth, test$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 581 119
Yes 0 0
Accuracy : 0.83
95% CI : (0.8001, 0.8571)
No Information Rate : 0.83
P-Value [Acc > NIR] : 0.5245
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.00
Specificity : 1.00
Pos Pred Value : NaN
Neg Pred Value : 0.83
Prevalence : 0.17
Detection Rate : 0.00
Detection Prevalence : 0.00
Balanced Accuracy : 0.50
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Urgentcare`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.556
With ethnicity
Training Set
mod1 <- glm(`Urgentcare`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Urgentcare`,positive="Yes")Warning in confusionMatrix.default(pred_noeth, train$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1320 277
Yes 0 0
Accuracy : 0.8265
95% CI : (0.8071, 0.8448)
No Information Rate : 0.8265
P-Value [Acc > NIR] : 0.516
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.0000
Specificity : 1.0000
Pos Pred Value : NaN
Neg Pred Value : 0.8265
Prevalence : 0.1735
Detection Rate : 0.0000
Detection Prevalence : 0.0000
Balanced Accuracy : 0.5000
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Urgentcare`,positive="Yes")Warning in confusionMatrix.default(pred_noeth, test$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 581 119
Yes 0 0
Accuracy : 0.83
95% CI : (0.8001, 0.8571)
No Information Rate : 0.83
P-Value [Acc > NIR] : 0.5245
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.00
Specificity : 1.00
Pos Pred Value : NaN
Neg Pred Value : 0.83
Prevalence : 0.17
Detection Rate : 0.00
Detection Prevalence : 0.00
Balanced Accuracy : 0.50
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Urgentcare`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
pROC::auc(rocobj_w)Area under the curve: 0.5549
Logistic Regression (ANOVA)
glm(`Urgentcare`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Urgentcare`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = Urgentcare ~ Age + Ethnicity + Gender + Religion +
Income_median + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.713487 0.435037 -6.237 4.45e-10 ***
Age 0.010618 0.003724 2.851 0.00436 **
EthnicityAsian Indian 0.061872 0.353773 0.175 0.86116
EthnicityFilipino 0.293290 0.249872 1.174 0.24049
EthnicityKorean 0.344107 0.200957 1.712 0.08683 .
EthnicityOther 0.299656 0.283000 1.059 0.28967
EthnicityVietnamese 0.432767 0.206818 2.093 0.03639 *
GenderMale 0.069658 0.117412 0.593 0.55300
ReligionCatholic 0.194386 0.204034 0.953 0.34073
ReligionHindu 0.093071 0.369859 0.252 0.80132
ReligionMuslim -0.421902 0.488375 -0.864 0.38765
ReligionNone -0.235401 0.226420 -1.040 0.29850
ReligionOther -0.066064 0.451691 -0.146 0.88372
ReligionProtestant -0.070257 0.219278 -0.320 0.74866
Income_medianAbove 0.031820 0.122729 0.259 0.79542
EmploymentEmployed full time 0.040204 0.120720 0.333 0.73911
EnglishSpeakNot well 0.148020 0.331690 0.446 0.65541
EnglishSpeakVery well 0.616306 0.342263 1.801 0.07175 .
EnglishSpeakWell 0.464274 0.329572 1.409 0.15892
EnglishDiffMuch 0.074610 0.182922 0.408 0.68336
EnglishDiffNot much 0.058714 0.170863 0.344 0.73112
EnglishDiffVery much -0.283294 0.172827 -1.639 0.10118
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2111.7 on 2296 degrees of freedom
Residual deviance: 2068.0 on 2275 degrees of freedom
AIC: 2112
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Urgentcare
LR Chisq Df Pr(>Chisq)
Age 8.0592 1 0.004527 **
Ethnicity 5.6501 5 0.341765
Gender 0.3517 1 0.553134
Religion 5.6203 6 0.467034
Income_median 0.0672 1 0.795419
Employment 0.1109 1 0.739154
EnglishSpeak 6.9808 3 0.072511 .
EnglishDiff 4.2913 3 0.231675
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.190407 1 1.091058
Ethnicity 14.637287 5 1.307814
Gender 1.104873 1 1.051129
Religion 12.935297 6 1.237793
Income_median 1.210429 1 1.100195
Employment 1.170741 1 1.082008
EnglishSpeak 2.262842 3 1.145801
EnglishDiff 1.776405 3 1.100501
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: Urgentcare ~ Age + Gender + Religion + Income_median + Employment +
EnglishSpeak + EnglishDiff
Model 2: Urgentcare ~ Age + Ethnicity + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2280 2073.7
2 2275 2068.0 5 5.6501 0.3418
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 2107.682 2112.032
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 2205.251 2238.298
Physical Checkup
ps(`Physical Check-up`)# A tibble: 3 × 3
`Physical Check-up` n pct
<fct> <int> <dbl>
1 0 833 31.9
2 Yes 1740 66.7
3 <NA> 36 1.38
Imbalanced Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
imb <- imbalanced(`Physical Check-up` ~ Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,importance=T,data=rfdata |> as.data.frame())
print(imb) Sample size: 2312
Frequency of class labels: 749, 1563
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 664.8643
No. of variables tried at each split: 3
Total no. of variables: 7
Resampling used to grow trees: swor
Resample size used to grow trees: 1461
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 2.0868
(OOB) Brier score: 0.22802431
(OOB) Normalized Brier score: 0.91209725
(OOB) AUC: 0.62964482
(OOB) PR-AUC: 0.43369316
(OOB) G-mean: 0.59465502
(OOB) Requested performance error: 0.40534498
Confusion matrix:
predicted
observed 0 Yes class.error
0 419 330 0.4406
Yes 575 988 0.3679
(OOB) Misclassification rate: 0.391436
imb_e <- imbalanced(`Physical Check-up` ~ Ethnicity+Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,importance=T,data=rfdata |> as.data.frame())
print(imb_e) Sample size: 2312
Frequency of class labels: 749, 1563
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 661.116
No. of variables tried at each split: 3
Total no. of variables: 8
Resampling used to grow trees: swor
Resample size used to grow trees: 1461
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 2.0868
(OOB) Brier score: 0.21922444
(OOB) Normalized Brier score: 0.87689775
(OOB) AUC: 0.6442072
(OOB) PR-AUC: 0.45199226
(OOB) G-mean: 0.60686387
(OOB) Requested performance error: 0.39313613
Confusion matrix:
predicted
observed 0 Yes class.error
0 429 320 0.4272
Yes 558 1005 0.3570
(OOB) Misclassification rate: 0.3797578
get.imbalanced.performance(imb_e) n.majority n.minority iratio threshold sens spec
1563.0000000 749.0000000 2.0867824 0.3239619 0.5727637 0.6429942
prec npv misclass brier brier.norm auc
0.4346505 0.7584906 0.3797578 0.2192244 0.8768977 0.6442072
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.4942396 0.5780136 0.3239619 0.4519923 0.5505517 0.5924388
gmean
0.6068639
imb_e$importance all 0 Yes
Ethnicity -0.014943794 NA NA
Age 0.009402163 NA NA
Gender -0.003275708 NA NA
Religion -0.025418225 NA NA
Income_median 0.001503733 NA NA
Employment -0.006740685 NA NA
EnglishSpeak -0.008992453 NA NA
EnglishDiff -0.026730015 NA NA
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Physical Check-up`=="Yes")
neg <- rfdata |> filter(`Physical Check-up`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Physical Check-up`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Physical Check-up` ~ Age + Gender + Religion + Income_median + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 30.95%
Confusion matrix:
0 Yes class.error
0 134 385 0.7418112
Yes 113 977 0.1036697
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 283 24
Yes 236 1066
Accuracy : 0.8384
95% CI : (0.8195, 0.8561)
No Information Rate : 0.6774
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.586
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9780
Specificity : 0.5453
Pos Pred Value : 0.8187
Neg Pred Value : 0.9218
Prevalence : 0.6774
Detection Rate : 0.6625
Detection Prevalence : 0.8092
Balanced Accuracy : 0.7616
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 59 56
Yes 171 417
Accuracy : 0.6771
95% CI : (0.6411, 0.7116)
No Information Rate : 0.6728
P-Value [Acc > NIR] : 0.4221
Kappa : 0.1585
Mcnemar's Test P-Value : 3.836e-14
Sensitivity : 0.8816
Specificity : 0.2565
Pos Pred Value : 0.7092
Neg Pred Value : 0.5130
Prevalence : 0.6728
Detection Rate : 0.5932
Detection Prevalence : 0.8364
Balanced Accuracy : 0.5691
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Physical Check-up`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Physical Check-up`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Physical Check-up` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 31.39%
Confusion matrix:
0 Yes class.error
0 141 378 0.7283237
Yes 127 963 0.1165138
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 339 21
Yes 180 1069
Accuracy : 0.8751
95% CI : (0.8579, 0.8908)
No Information Rate : 0.6774
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6892
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9807
Specificity : 0.6532
Pos Pred Value : 0.8559
Neg Pred Value : 0.9417
Prevalence : 0.6774
Detection Rate : 0.6644
Detection Prevalence : 0.7763
Balanced Accuracy : 0.8170
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 63 56
Yes 167 417
Accuracy : 0.6828
95% CI : (0.647, 0.7171)
No Information Rate : 0.6728
P-Value [Acc > NIR] : 0.3018
Kappa : 0.1775
Mcnemar's Test P-Value : 1.756e-13
Sensitivity : 0.8816
Specificity : 0.2739
Pos Pred Value : 0.7140
Neg Pred Value : 0.5294
Prevalence : 0.6728
Detection Rate : 0.5932
Detection Prevalence : 0.8307
Balanced Accuracy : 0.5778
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Physical Check-up`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6731
pROC::auc(rocobj_wo)Area under the curve: 0.6575
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity -3.087375 13.95757 11.099789 53.43316
Age 3.676369 32.92101 28.802195 138.98329
Gender -2.387634 12.62020 9.452973 21.99246
Religion -9.729874 14.92892 8.682015 58.50328
Employment -6.546450 18.28596 13.654564 19.25623
Income_median -8.216131 24.55044 18.117669 24.37976
EnglishSpeak -3.384102 19.75072 18.404527 40.81256
EnglishDiff -1.236171 13.64378 11.390362 46.50875
No change between accuracy and no information rate on the test set.
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Physical Check-up`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 171 101
Yes 348 989
Accuracy : 0.7209
95% CI : (0.6983, 0.7428)
No Information Rate : 0.6774
P-Value [Acc > NIR] : 8.82e-05
Kappa : 0.2705
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9073
Specificity : 0.3295
Pos Pred Value : 0.7397
Neg Pred Value : 0.6287
Prevalence : 0.6774
Detection Rate : 0.6147
Detection Prevalence : 0.8310
Balanced Accuracy : 0.6184
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 58 49
Yes 172 424
Accuracy : 0.6856
95% CI : (0.6499, 0.7198)
No Information Rate : 0.6728
P-Value [Acc > NIR] : 0.248
Kappa : 0.1722
Mcnemar's Test P-Value : 2.275e-16
Sensitivity : 0.8964
Specificity : 0.2522
Pos Pred Value : 0.7114
Neg Pred Value : 0.5421
Prevalence : 0.6728
Detection Rate : 0.6031
Detection Prevalence : 0.8478
Balanced Accuracy : 0.5743
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Physical Check-up`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6807
With ethnicity
Training Set
mod1 <- glm(`Physical Check-up`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 169 93
Yes 350 997
Accuracy : 0.7247
95% CI : (0.7021, 0.7464)
No Information Rate : 0.6774
P-Value [Acc > NIR] : 2.256e-05
Kappa : 0.2761
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9147
Specificity : 0.3256
Pos Pred Value : 0.7402
Neg Pred Value : 0.6450
Prevalence : 0.6774
Detection Rate : 0.6196
Detection Prevalence : 0.8372
Balanced Accuracy : 0.6202
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 66 57
Yes 164 416
Accuracy : 0.6856
95% CI : (0.6499, 0.7198)
No Information Rate : 0.6728
P-Value [Acc > NIR] : 0.248
Kappa : 0.189
Mcnemar's Test P-Value : 1.001e-12
Sensitivity : 0.8795
Specificity : 0.2870
Pos Pred Value : 0.7172
Neg Pred Value : 0.5366
Prevalence : 0.6728
Detection Rate : 0.5917
Detection Prevalence : 0.8250
Balanced Accuracy : 0.5832
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Physical Check-up`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
AUROC
pROC::auc(rocobj_w)Area under the curve: 0.6968
pROC::auc(rocobj_wo)Area under the curve: 0.6807
Logistic Regression (ANOVA)
glm(`Physical Check-up`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Physical Check-up`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Physical Check-up` ~ Age + Ethnicity + Gender +
Religion + Income_median + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.676654 0.341410 -4.911 9.06e-07 ***
Age 0.036935 0.003449 10.710 < 2e-16 ***
EthnicityAsian Indian -0.406560 0.282712 -1.438 0.150413
EthnicityFilipino -0.090802 0.231634 -0.392 0.695054
EthnicityKorean -0.676520 0.161726 -4.183 2.88e-05 ***
EthnicityOther -0.485989 0.236236 -2.057 0.039665 *
EthnicityVietnamese 0.300147 0.178455 1.682 0.092585 .
GenderMale -0.678812 0.101920 -6.660 2.73e-11 ***
ReligionCatholic 0.274057 0.192759 1.422 0.155096
ReligionHindu 0.299968 0.304942 0.984 0.325270
ReligionMuslim 0.286958 0.365679 0.785 0.432613
ReligionNone 0.192141 0.188578 1.019 0.308253
ReligionOther -0.174811 0.369064 -0.474 0.635743
ReligionProtestant 0.078148 0.190495 0.410 0.681631
Income_medianAbove 0.688117 0.105685 6.511 7.46e-11 ***
EmploymentEmployed full time 0.235013 0.105392 2.230 0.025754 *
EnglishSpeakNot well 0.887367 0.233141 3.806 0.000141 ***
EnglishSpeakVery well 1.460774 0.251893 5.799 6.66e-09 ***
EnglishSpeakWell 1.226867 0.236292 5.192 2.08e-07 ***
EnglishDiffMuch -0.276886 0.164611 -1.682 0.092557 .
EnglishDiffNot much -0.543283 0.152471 -3.563 0.000366 ***
EnglishDiffVery much -0.424588 0.146513 -2.898 0.003756 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2912.3 on 2311 degrees of freedom
Residual deviance: 2595.6 on 2290 degrees of freedom
AIC: 2639.6
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Physical Check-up
LR Chisq Df Pr(>Chisq)
Age 127.697 1 < 2.2e-16 ***
Ethnicity 36.514 5 7.493e-07 ***
Gender 45.332 1 1.663e-11 ***
Religion 4.199 6 0.649743
Income_median 43.104 1 5.191e-11 ***
Employment 4.986 1 0.025551 *
EnglishSpeak 36.256 3 6.610e-08 ***
EnglishDiff 15.945 3 0.001164 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.318613 1 1.148309
Ethnicity 14.056188 5 1.302527
Gender 1.140810 1 1.068087
Religion 12.367064 6 1.233168
Income_median 1.212406 1 1.101093
Employment 1.207575 1 1.098897
EnglishSpeak 2.519487 3 1.166502
EnglishDiff 1.823482 3 1.105309
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Physical Check-up` ~ Age + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Model 2: `Physical Check-up` ~ Age + Ethnicity + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2295 2632.1
2 2290 2595.6 5 36.514 7.493e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 2666.094 2639.58
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 2763.774 2765.989
Dental Checkup
ps(`Dentist Check-up`)# A tibble: 3 × 3
`Dentist Check-up` n pct
<fct> <int> <dbl>
1 0 1100 42.2
2 Yes 1462 56.0
3 <NA> 47 1.80
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Dentist Check-up`=="Yes")
neg <- rfdata |> filter(`Dentist Check-up`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Dentist Check-up`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Dentist Check-up` ~ Age + Gender + Religion + Income_median + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 32.75%
Confusion matrix:
0 Yes class.error
0 338 325 0.4901961
Yes 200 740 0.2127660
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 494 83
Yes 169 857
Accuracy : 0.8428
95% CI : (0.824, 0.8603)
No Information Rate : 0.5864
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6696
Mcnemar's Test P-Value : 8.579e-08
Sensitivity : 0.9117
Specificity : 0.7451
Pos Pred Value : 0.8353
Neg Pred Value : 0.8562
Prevalence : 0.5864
Detection Rate : 0.5346
Detection Prevalence : 0.6400
Balanced Accuracy : 0.8284
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 127 106
Yes 167 302
Accuracy : 0.6111
95% CI : (0.5739, 0.6474)
No Information Rate : 0.5812
P-Value [Acc > NIR] : 0.0580220
Kappa : 0.1773
Mcnemar's Test P-Value : 0.0002819
Sensitivity : 0.7402
Specificity : 0.4320
Pos Pred Value : 0.6439
Neg Pred Value : 0.5451
Prevalence : 0.5812
Detection Rate : 0.4302
Detection Prevalence : 0.6681
Balanced Accuracy : 0.5861
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Dentist Check-up`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Dentist Check-up`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Dentist Check-up` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 32.56%
Confusion matrix:
0 Yes class.error
0 356 307 0.4630468
Yes 215 725 0.2287234
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 536 71
Yes 127 869
Accuracy : 0.8765
95% CI : (0.8594, 0.8922)
No Information Rate : 0.5864
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7422
Mcnemar's Test P-Value : 9.28e-05
Sensitivity : 0.9245
Specificity : 0.8084
Pos Pred Value : 0.8725
Neg Pred Value : 0.8830
Prevalence : 0.5864
Detection Rate : 0.5421
Detection Prevalence : 0.6213
Balanced Accuracy : 0.8665
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 134 108
Yes 160 300
Accuracy : 0.6182
95% CI : (0.5811, 0.6543)
No Information Rate : 0.5812
P-Value [Acc > NIR] : 0.025155
Kappa : 0.1959
Mcnemar's Test P-Value : 0.001837
Sensitivity : 0.7353
Specificity : 0.4558
Pos Pred Value : 0.6522
Neg Pred Value : 0.5537
Prevalence : 0.5812
Detection Rate : 0.4274
Detection Prevalence : 0.6553
Balanced Accuracy : 0.5955
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Dentist Check-up`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.664
pROC::auc(rocobj_wo)Area under the curve: 0.6475
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 5.914364 16.533647 21.773611 62.73446
Age 15.716389 24.579291 30.575646 144.09703
Gender 9.438840 2.980741 9.005174 23.20846
Religion 1.614406 18.851975 20.026908 67.94336
Employment 10.150475 6.196819 14.707543 20.87180
Income_median 16.228358 19.238205 25.864354 34.96555
EnglishSpeak 14.079554 19.122658 26.481921 55.06404
EnglishDiff 7.434635 7.671219 11.747027 50.62556
Accuracy slightly decreased when ethnicity is added to the model, but AUC increased.
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Dentist Check-up`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 338 186
Yes 325 754
Accuracy : 0.6812
95% CI : (0.6578, 0.704)
No Information Rate : 0.5864
P-Value [Acc > NIR] : 3.194e-15
Kappa : 0.3219
Mcnemar's Test P-Value : 1.030e-09
Sensitivity : 0.8021
Specificity : 0.5098
Pos Pred Value : 0.6988
Neg Pred Value : 0.6450
Prevalence : 0.5864
Detection Rate : 0.4704
Detection Prevalence : 0.6731
Balanced Accuracy : 0.6560
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 142 95
Yes 152 313
Accuracy : 0.6481
95% CI : (0.6115, 0.6835)
No Information Rate : 0.5812
P-Value [Acc > NIR] : 0.0001672
Kappa : 0.2571
Mcnemar's Test P-Value : 0.0003664
Sensitivity : 0.7672
Specificity : 0.4830
Pos Pred Value : 0.6731
Neg Pred Value : 0.5992
Prevalence : 0.5812
Detection Rate : 0.4459
Detection Prevalence : 0.6624
Balanced Accuracy : 0.6251
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Dentist Check-up`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6786
With ethnicity
Training Set
mod1 <- glm(`Dentist Check-up`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 345 183
Yes 318 757
Accuracy : 0.6875
95% CI : (0.6641, 0.7101)
No Information Rate : 0.5864
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3358
Mcnemar's Test P-Value : 2.142e-09
Sensitivity : 0.8053
Specificity : 0.5204
Pos Pred Value : 0.7042
Neg Pred Value : 0.6534
Prevalence : 0.5864
Detection Rate : 0.4722
Detection Prevalence : 0.6706
Balanced Accuracy : 0.6628
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 143 100
Yes 151 308
Accuracy : 0.6425
95% CI : (0.6057, 0.678)
No Information Rate : 0.5812
P-Value [Acc > NIR] : 0.0005277
Kappa : 0.2473
Mcnemar's Test P-Value : 0.0015996
Sensitivity : 0.7549
Specificity : 0.4864
Pos Pred Value : 0.6710
Neg Pred Value : 0.5885
Prevalence : 0.5812
Detection Rate : 0.4387
Detection Prevalence : 0.6538
Balanced Accuracy : 0.6206
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Dentist Check-up`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
pROC::auc(rocobj_w)Area under the curve: 0.6831
Logistic Regression (ANOVA)
glm(`Dentist Check-up`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Dentist Check-up`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Dentist Check-up` ~ Age + Ethnicity + Gender +
Religion + Income_median + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.420616 0.333355 -4.262 2.03e-05 ***
Age 0.022557 0.003124 7.221 5.18e-13 ***
EthnicityAsian Indian -0.687213 0.276542 -2.485 0.012954 *
EthnicityFilipino -0.323056 0.218459 -1.479 0.139195
EthnicityKorean -0.658031 0.158064 -4.163 3.14e-05 ***
EthnicityOther -0.378098 0.234347 -1.613 0.106655
EthnicityVietnamese -0.265212 0.169321 -1.566 0.117273
GenderMale -0.555174 0.097153 -5.714 1.10e-08 ***
ReligionCatholic -0.085299 0.180831 -0.472 0.637139
ReligionHindu -0.841902 0.295985 -2.844 0.004449 **
ReligionMuslim -0.591290 0.357829 -1.652 0.098445 .
ReligionNone -0.223659 0.182589 -1.225 0.220600
ReligionOther -0.827481 0.366753 -2.256 0.024056 *
ReligionProtestant -0.299644 0.183141 -1.636 0.101810
Income_medianAbove 0.859564 0.101234 8.491 < 2e-16 ***
EmploymentEmployed full time 0.331433 0.100561 3.296 0.000981 ***
EnglishSpeakNot well 1.017252 0.238818 4.260 2.05e-05 ***
EnglishSpeakVery well 1.869347 0.256245 7.295 2.98e-13 ***
EnglishSpeakWell 1.294431 0.240885 5.374 7.72e-08 ***
EnglishDiffMuch -0.085002 0.153919 -0.552 0.580777
EnglishDiffNot much -0.155011 0.143920 -1.077 0.281452
EnglishDiffVery much -0.367683 0.139112 -2.643 0.008216 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3128.8 on 2304 degrees of freedom
Residual deviance: 2775.6 on 2283 degrees of freedom
AIC: 2819.6
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Dentist Check-up
LR Chisq Df Pr(>Chisq)
Age 53.872 1 2.140e-13 ***
Ethnicity 20.219 5 0.0011367 **
Gender 33.092 1 8.792e-09 ***
Religion 12.224 6 0.0571474 .
Income_median 74.013 1 < 2.2e-16 ***
Employment 10.912 1 0.0009556 ***
EnglishSpeak 62.717 3 1.544e-13 ***
EnglishDiff 7.472 3 0.0582818 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.265631 1 1.125003
Ethnicity 14.459364 5 1.306216
Gender 1.128714 1 1.062409
Religion 12.632464 6 1.235352
Income_median 1.218138 1 1.103693
Employment 1.200260 1 1.095564
EnglishSpeak 2.432475 3 1.159689
EnglishDiff 1.785636 3 1.101452
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Dentist Check-up` ~ Age + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Model 2: `Dentist Check-up` ~ Age + Ethnicity + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2288 2795.8
2 2283 2775.6 5 20.219 0.001137 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 2829.781 2819.561
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 2927.409 2945.904
Folk Medicine
ps(`Folkmedicine`)# A tibble: 3 × 3
Folkmedicine n pct
<fct> <int> <dbl>
1 0 2189 83.9
2 Yes 348 13.3
3 <NA> 72 2.76
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Folkmedicine`=="Yes")
neg <- rfdata |> filter(`Folkmedicine`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Folkmedicine`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = Folkmedicine ~ Age + Gender + Religion + Income_median + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 13.67%
Confusion matrix:
0 Yes class.error
0 1369 3 0.002186589
Yes 214 1 0.995348837
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1372 187
Yes 0 28
Accuracy : 0.8822
95% CI : (0.8653, 0.8976)
No Information Rate : 0.8645
P-Value [Acc > NIR] : 0.02036
Kappa : 0.2057
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.13023
Specificity : 1.00000
Pos Pred Value : 1.00000
Neg Pred Value : 0.88005
Prevalence : 0.13548
Detection Rate : 0.01764
Detection Prevalence : 0.01764
Balanced Accuracy : 0.56512
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 600 97
Yes 0 0
Accuracy : 0.8608
95% CI : (0.8329, 0.8857)
No Information Rate : 0.8608
P-Value [Acc > NIR] : 0.527
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.0000
Specificity : 1.0000
Pos Pred Value : NaN
Neg Pred Value : 0.8608
Prevalence : 0.1392
Detection Rate : 0.0000
Detection Prevalence : 0.0000
Balanced Accuracy : 0.5000
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Folkmedicine`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Folkmedicine`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = Folkmedicine ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 13.74%
Confusion matrix:
0 Yes class.error
0 1366 6 0.004373178
Yes 212 3 0.986046512
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1372 151
Yes 0 64
Accuracy : 0.9049
95% CI : (0.8893, 0.9188)
No Information Rate : 0.8645
P-Value [Acc > NIR] : 5.528e-07
Kappa : 0.4229
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.29767
Specificity : 1.00000
Pos Pred Value : 1.00000
Neg Pred Value : 0.90085
Prevalence : 0.13548
Detection Rate : 0.04033
Detection Prevalence : 0.04033
Balanced Accuracy : 0.64884
'Positive' Class : Yes
Test Set
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 599 96
Yes 1 1
Accuracy : 0.8608
95% CI : (0.8329, 0.8857)
No Information Rate : 0.8608
P-Value [Acc > NIR] : 0.527
Kappa : 0.0147
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.010309
Specificity : 0.998333
Pos Pred Value : 0.500000
Neg Pred Value : 0.861871
Prevalence : 0.139168
Detection Rate : 0.001435
Detection Prevalence : 0.002869
Balanced Accuracy : 0.504321
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Folkmedicine`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6505
pROC::auc(rocobj_wo)Area under the curve: 0.6324
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 15.677593 6.0677957 17.886860 30.80462
Age 17.149669 11.4607503 20.466491 80.91698
Gender 2.203892 5.5019256 4.370974 10.92695
Religion 7.669136 2.1518765 8.434272 29.86329
Employment 12.751245 3.9668282 14.131463 10.88055
Income_median 11.039447 6.7729612 13.230817 11.99100
EnglishSpeak 13.849619 0.3284309 13.900533 23.83759
EnglishDiff 8.239005 3.7963037 9.421784 24.95166
Accuracy slightly decreased when ethnicity is added to the model.
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Folkmedicine`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1371 214
Yes 1 1
Accuracy : 0.8645
95% CI : (0.8467, 0.881)
No Information Rate : 0.8645
P-Value [Acc > NIR] : 0.5182
Kappa : 0.0067
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.0046512
Specificity : 0.9992711
Pos Pred Value : 0.5000000
Neg Pred Value : 0.8649842
Prevalence : 0.1354757
Detection Rate : 0.0006301
Detection Prevalence : 0.0012602
Balanced Accuracy : 0.5019611
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 599 97
Yes 1 0
Accuracy : 0.8594
95% CI : (0.8314, 0.8844)
No Information Rate : 0.8608
P-Value [Acc > NIR] : 0.5702
Kappa : -0.0028
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.000000
Specificity : 0.998333
Pos Pred Value : 0.000000
Neg Pred Value : 0.860632
Prevalence : 0.139168
Detection Rate : 0.000000
Detection Prevalence : 0.001435
Balanced Accuracy : 0.499167
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Folkmedicine`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6263
With ethnicity
Training Set
mod1 <- glm(`Folkmedicine`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1371 213
Yes 1 2
Accuracy : 0.8652
95% CI : (0.8474, 0.8816)
No Information Rate : 0.8645
P-Value [Acc > NIR] : 0.4889
Kappa : 0.0147
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.009302
Specificity : 0.999271
Pos Pred Value : 0.666667
Neg Pred Value : 0.865530
Prevalence : 0.135476
Detection Rate : 0.001260
Detection Prevalence : 0.001890
Balanced Accuracy : 0.504287
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 596 97
Yes 4 0
Accuracy : 0.8551
95% CI : (0.8267, 0.8804)
No Information Rate : 0.8608
P-Value [Acc > NIR] : 0.6923
Kappa : -0.0111
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.000000
Specificity : 0.993333
Pos Pred Value : 0.000000
Neg Pred Value : 0.860029
Prevalence : 0.139168
Detection Rate : 0.000000
Detection Prevalence : 0.005739
Balanced Accuracy : 0.496667
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Folkmedicine`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
AUROC
pROC::auc(rocobj_w)Area under the curve: 0.64
pROC::auc(rocobj_wo)Area under the curve: 0.6263
Logistic Regression (ANOVA)
glm(`Folkmedicine`~Age+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Folkmedicine`~Age+Ethnicity+Gender+Religion + Income_median +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = Folkmedicine ~ Age + Ethnicity + Gender + Religion +
Income_median + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.981039 0.418722 -4.731 2.23e-06 ***
Age 0.023752 0.004107 5.783 7.35e-09 ***
EthnicityAsian Indian -0.163409 0.428106 -0.382 0.70268
EthnicityFilipino -0.593820 0.305210 -1.946 0.05170 .
EthnicityKorean 0.221605 0.184995 1.198 0.23096
EthnicityOther -0.412059 0.327426 -1.258 0.20822
EthnicityVietnamese -1.022804 0.240866 -4.246 2.17e-05 ***
GenderMale -0.246577 0.134174 -1.838 0.06610 .
ReligionCatholic -0.306849 0.259785 -1.181 0.23754
ReligionHindu -0.729681 0.468087 -1.559 0.11903
ReligionMuslim -2.770535 1.074730 -2.578 0.00994 **
ReligionNone -0.280427 0.236332 -1.187 0.23539
ReligionOther 0.203293 0.499046 0.407 0.68374
ReligionProtestant -0.175201 0.231205 -0.758 0.44859
Income_medianAbove 0.122587 0.137438 0.892 0.37242
EmploymentEmployed full time -0.151345 0.140684 -1.076 0.28203
EnglishSpeakNot well -0.138418 0.271866 -0.509 0.61065
EnglishSpeakVery well -0.461749 0.305648 -1.511 0.13086
EnglishSpeakWell -0.102828 0.276832 -0.371 0.71031
EnglishDiffMuch -0.030922 0.211613 -0.146 0.88382
EnglishDiffNot much -0.098880 0.197842 -0.500 0.61722
EnglishDiffVery much -0.124122 0.208871 -0.594 0.55234
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1821.5 on 2283 degrees of freedom
Residual deviance: 1676.7 on 2262 degrees of freedom
AIC: 1720.7
Number of Fisher Scoring iterations: 6
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Folkmedicine
LR Chisq Df Pr(>Chisq)
Age 33.609 1 6.739e-09 ***
Ethnicity 29.655 5 1.725e-05 ***
Gender 3.406 1 0.06496 .
Religion 14.644 6 0.02322 *
Income_median 0.796 1 0.37219
Employment 1.162 1 0.28103
EnglishSpeak 4.090 3 0.25192
EnglishDiff 0.533 3 0.91152
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.201208 1 1.095996
Ethnicity 12.995193 5 1.292344
Gender 1.088221 1 1.043178
Religion 11.248192 6 1.223461
Income_median 1.186867 1 1.089434
Employment 1.181656 1 1.087040
EnglishSpeak 2.420539 3 1.158738
EnglishDiff 1.807754 3 1.103714
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: Folkmedicine ~ Age + Gender + Religion + Income_median + Employment +
EnglishSpeak + EnglishDiff
Model 2: Folkmedicine ~ Age + Ethnicity + Gender + Religion + Income_median +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2267 1706.3
2 2262 1676.7 5 29.655 1.725e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 1740.355 1720.7
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 1837.828 1846.841